{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      0                                                  1\n",
      "0   ham  Go until jurong point, crazy.. Available only ...\n",
      "1   ham                      Ok lar... Joking wif u oni...\n",
      "2  spam  Free entry in 2 a wkly comp to win FA Cup fina...\n",
      "3   ham  U dun say so early hor... U c already then say...\n",
      "4   ham  Nah I don't think he goes to usf, he lives aro...\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# The SMS Spam Collection is tab-separated and has no header row, so pandas\n",
    "# labels the columns 0 (ham/spam label) and 1 (message text).\n",
    "df = pd.read_csv('./SMSSpamCollection', sep='\\t', header=None)\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of spam messages: 747\n",
      "Number of ham messages: 4825\n"
     ]
    }
   ],
   "source": [
    "# Class balance: summing a boolean mask counts the rows carrying each label.\n",
    "print('Number of spam messages: %s' % (df[0] == 'spam').sum())\n",
    "print('Number of ham messages: %s' % (df[0] == 'ham').sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Predicted: ham, message: Now thats going to ruin your thesis!\n",
      "Predicted: ham, message: Ok...\n",
      "Predicted: ham, message: Its a part of checking IQ\n",
      "Predicted: spam, message: Ringtone Club: Gr8 new polys direct to your mobile every week !\n",
      "Predicted: ham, message: Talk sexy!! Make new friends or fall in love in the worlds most discreet text dating service. Just text VIP to 83110 and see who you could meet.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "# Import from the public package: the private `sklearn.linear_model.logistic`\n",
    "# module path was deprecated and later removed from scikit-learn.\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split, cross_val_score\n",
    "\n",
    "# Column 1 holds the raw message text, column 0 the 'ham'/'spam' label.\n",
    "X = df[1].values\n",
    "y = df[0].values\n",
    "\n",
    "# Fix the seed so the train/test split, and everything printed below, is the\n",
    "# same on every Restart & Run All.\n",
    "X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=42)\n",
    "\n",
    "# Fit the TF-IDF vocabulary and weights on the training messages only, then\n",
    "# reuse that fitted vectorizer to transform the held-out messages.\n",
    "vectorizer = TfidfVectorizer()\n",
    "X_train = vectorizer.fit_transform(X_train_raw)\n",
    "X_test = vectorizer.transform(X_test_raw)\n",
    "\n",
    "classifier = LogisticRegression()\n",
    "classifier.fit(X_train, y_train)\n",
    "predictions = classifier.predict(X_test)\n",
    "\n",
    "# Show the first five test messages next to their predicted labels.\n",
    "for prediction, message in zip(predictions[:5], X_test_raw[:5]):\n",
    "    print('Predicted: %s, message: %s' % (prediction, message))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[4 1]\n",
      " [2 3]]\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQkAAAD3CAYAAAAOh6G5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAF/xJREFUeJzt3X20XXV95/H3Jw8QIBGUUIEQjOXBjmXJU6SiU0vVIiAF\npgMjilosmpGOTxXbAaSATB3b6Zo+sLBiEAXUoVCUrhTDSqnVAVyAJDEgAcRgZQikhgAC4SEm937m\nj72vHi737LPvzTl3n3vP57XWXjl779/97d85ued7f0/7t2WbiIh2ZjRdgIjobwkSEVEpQSIiKiVI\nRESlBImIqJQgERGVEiQiphlJMyV9X9INY5zbUdI1ktZJukPSok75JUhETD8fA+5rc+4M4Enb+wN/\nDfxFp8wSJBomaSdJ/yTpKUn/sB35nCbpn7tZtqZI+k1JP2y6HFORpH2AdwBfbJPkRODK8vV1wFsl\nqSrPBImaJL1b0kpJmyVtkHSjpP/YhaxPBl4J7G77lIlmYvtrto/uQnl6SpIl7V+VxvYttl8zWWWa\nZv4G+BNguM35BcDDALa3AU8Bu1dlOKubpZuuJH0COBv4ELAC+DlwDEVUvnU7s38V8ED5HzbwJM0a\npM/i7b+9ix9/YqhW2lV3b1kLvNByaKntpSM7ko4HNtpeJemorhXSdraKDdgV2AycUpFmR4oI/mi5\n/Q2wY3nuKGA9cBawEdgAvL8892mKgLO1vMYZwIXAV1vyXgQYmFXunw78GHgG+DfgtJbjt7b83BuB\nOyn+UtwJvLHl3HeA/wF8t8znn4H5bd7bSPn/pKX8JwHHAQ8ATwDntqQ/ArgN+FmZ9hJgh/LczeV7\nebZ8v+9syf+/A/8OfGXkWPkz+5XXOKzc3xt4DDiq6d+NbmyHvW5Hb92wX60NWNnhd/Wz5Wf5k/Kz\nfK71d6lMswI4snw9C9gEqCrfNDc6OxKYA1xfkeZTwBuAQ4CDKb4o57Wc35Mi2CygCASfk/Ry2xcA\n/xO4xvZc25dXFUTSLsDFwLG251EEgjVjpHsF8M0y7e7AXwHflNRarXw38H7gV4AdgE9WXHpPis9g\nAXA+cBnwHuBw4DeBP5X06jLtEPBHwHyKz+6twB8C2H5zmebg8v1e05L/KyhqVUtaL2z7QYoA8lVJ\nOwNfBq60/Z2K8k4hZsjDtbaOOdnn2N7H9iLgVOBfbb9nVLJlwO+Xr08u01Te5Zkg0dnuwCZXV4FP\nAy6yvdH2YxQ1hPe2nN9ant9qeznFX9GJtrmHgYMk7WR7g+21Y6R5B/Aj21+xvc321cD9wO+2pPmy\n7QdsPw9cSxHg2tkKfMb2VuDvKQLA39p+prz+vRTBEdurbN9eXvcnwBeA36rxni6wvaUsz4vYvgxY\nB9wB7EURlKcFA8O41jZRki6SdEK5ezmwu6R1wEgzulL6JDp7HJjfoa28N/BQy/5D5bFf5DHqZ58D\n5o63ILaflfROir/6l0v6LnCW7fs7lGekTAta9v99HOV53PZIw3nkS/zTlvPPj/y8pAMpai6LgZ0p\nfsdWVb0v4DHbL3RIcxnFX8Eltrd0SDtlGLPV9fokxpVvUdP6Tvn6/JbjLwDj6iBPTaKz24AtFO3w\ndh6lqCqP2Lc8NhHPUny5RuzZetL2Ctu/Q/EX9X6KL0+n8oyU6ZEJlmk8Pk9RrgNsvww4F6gcYoPq\nP5OS5lL081wOXFg2p6aNXtcktleCRAe2n6Joh39O0kmSdpY0W9J5kp4oq20PAedJ2kPS/DL9Vyd4\nyTXAmyXtK2lX4JyRE5JeKenEsm9iC0WzZazG6nLgwHLYdlZZ+3gt8JIZeD0wD3ga2Czp14AzR53/\nKfCrNfJ5uaSNku4B/pai0+4DFH0tl3azwE0yMIRrbU1JkKjB9v+maL+dR9Gz/jDFl3cJxZdvPsWI\nw93AD4DVwJ9N8Fo3AdeUea3ixV/sGW
U5HqXo8f8tXvolxPbjwPEUIyqPU4xMHG9700TKNE6fpOgU\nfYailnPNqPMXAldK+pmk/1KRz7MUw8zzyn9H3ucngMMkndbNQjep32sS6tCxGWOQdCRwoe23l/vn\nANj+bKMFm2bK+wpusH1Qw0XpmYMP3sErls+vlXavfTassr24x0V6idQkJuYXs9ZK63lxp2BEbcM1\nt6ZkdCOiQW64v6GOBImJeQRY2LK/D5MzchDTjA1b+ztGJEhM0J3AAeUsw0coZre9u9kixdQkhjqO\nEDcrfRITUE6M+jDFPPj7gGvbzHyMCZJ0NcUclddIWi/pjKbL1AsGhl1va0pqEhNUTq9e3nQ5pivb\n72q6DJOl32sSCRIRDSomUyVIRESFYSdIREQbqUlERCUjtnpm08WolNGN7SBpSedUsT2m+2c8UpOo\nszUlQWL7TOtf4D4xzT9jMeQZtbampLkR0aBiZar+/lvdV0Fi/itmetHC2U0Xo7Z9F8xi8cFz+nxS\n7Ys9cPfOnRP1kTnszMv0iin1Gb/As/zcW2q3D9JxOQ6LFs7meysWdk4YE/b2vauWsoxuuMPfqp3W\nVqNNiTr6KkhEDKLh1CQioh0jfu7+/hr2d+kiprl0XEZER0OZlh0R7RgxlJpERFQZzuhGRLRTTMtO\nkIiINqbCDV4JEhENsun7yVT9XbqIaU8M19w65iTNkfQ9SXdJWivp02OkOV3SY5LWlNsHOuWbmkRE\ng0xXaxJbgLfY3ixpNnCrpBtt3z4q3TW2P1w30wSJiIZ1q+PSxTM7N5e7s8ttu2+OS3MjokFGDLve\nVoekmZLWABuBm2zfMUay/yzpbknXSep4R2WCRETDhphRawPmS1rZsr1kQR7bQ7YPoXiq3BGSRj9s\n+Z+ARbZfB9wEXNmpfGluRDRonEOgm+o+Vdz2zyR9GzgGuKfl+OMtyb4I/K9OeaUmEdGg4gleM2pt\nnUjaQ9Ju5eudgN8B7h+VZq+W3RMonkBXKTWJiIZ1cWWqvYArJc2kqABca/sGSRcBK20vAz4q6QRg\nG/AEcHqnTBMkIhpkq2v3bti+Gzh0jOPnt7w+BzhnPPkmSEQ0rN9nXCZIRDSoWHQm60lERFtZCDci\nKhhyF2hEtDcy47KfJUhENCwL4UZEW8V6EqlJRESFNDcioq2iTyLNjYiokAcGR0RbRmwbzhBoRFTI\njMuIaCujGxHRUTouI6KtzLiMiI7SJxERbRXL1yVIREQ7zhBoRFTIojMR0VGaGxHR1lTok+jpAK2k\nYyT9UNI6SWf38loRU1U3H/PXCz2rSZRr/3+O4gEh64E7JS2zfW+vrhkx1Qz6PIkjgHW2fwwg6e+B\nE4EEiYgRhm0DPONyAfBwy/564Dd6eL2IKWcq9Ek03nFZPhl5CcC+CxovTsSk6/cg0ct6ziPAwpb9\nfcpjL2J7qe3FthfvsXt/TyqJ6LaRPol+7rjsZZC4EzhA0qsl7QCcCizr4fUipiRbtbam9Kx+b3ub\npA8DK4CZwJdsr+3V9SKmqoGecWl7ObC8l9eImMrs7vVJSJoD3AzsSPHdvs72BaPS7AhcBRwOPA68\n0/ZPqvJNT2FEo8TQcNda/VuAt9jeLGk2cKukG23f3pLmDOBJ2/tLOhX4C+CdVZn29wBtxADoVp+E\nC5vL3dnl5lHJTgSuLF9fB7xVUmXmCRIRDRqZJ1FzdGO+pJUt25LR+UmaKWkNsBG4yfYdo5L8Yv6S\n7W3AU8DuVWVMcyOiSS76JWraZHtxZXb2EHCIpN2A6yUdZPue7SliahIRDRtGtbbxsP0z4NvAMaNO\n/WL+kqRZwK4UHZhtJUhENMh0r09C0h5lDQJJO1HcXHn/qGTLgN8vX58M/KtdXZdJcyOiUV2dTbkX\ncGV5B/YM4FrbN0i6CFhpexlwOfAVSeuAJygmOVZKkIho2PBwd4KE7buBQ8c4fn7L6xeAU8aTb4JE\nRI
NsGp1yXUeCRETD+v0u0ASJiIaNYwi0EQkSEQ1LcyMi2jLN3gZeR4JERMP6vLWRIBHRKIO7NATa\nKwkSEQ2bss0NSS+r+kHbT3e/OBGDZyqPbqylaC61hrmRfQP79rBcEQNh5N6NftY2SNhe2O5cRHSJ\ngT4PErXuApV0qqRzy9f7SDq8t8WKGBx2va0pHYOEpEuA3wbeWx56Dri0l4WKGCiuuTWkzujGG20f\nJun7ALafKJ+jERHbTdNiCHSrpBmUsUzS7sBwT0sVMSimwF2gdfokPgd8HdhD0qeBWymW4Y6Ibpjq\nzQ3bV0laBbytPHTK9i6sGRGt+rsmUXfG5UxgK0U8y7qYEd3U55Op6oxufAq4Gtib4sng/0fSOb0u\nWMTAmOrNDeB9wKG2nwOQ9Bng+8Bne1mwiIEwTW7w2jAq3azyWER0Q583N6pu8PpriuI/AayVtKLc\nPxq4c3KKFzEA+nwItKomMTKCsRb4Zsvx28dIGxETpKlak7B9+WQWJGIgNdwpWUfHPglJ+wGfAV4L\nzBk5bvvAHpYrYkCo75sbdeY8XAF8mWLGx7HAtcA1PSxTxGDp8yHQOkFiZ9srAGw/aPs8imAREd0w\nXHNrSJ0h0C3lDV4PSvoQxaPL5/W2WBEDYposOvNHwC7AR4E3AR8E/qCXhYoYJHK9rWM+0kJJ35Z0\nr6S1kj42RpqjJD0laU25nT9WXq3q3OB1R/nyGX658ExEdEv3+hu2AWfZXi1pHrBK0k227x2V7hbb\nx9fNtGoy1fVUFN/279W9SF33ProHh194ZrezjRazlz/WdBGmvaGP3trIdW1voJwNbfsZSfcBC4DR\nQWJcqmoSl2xPxhFRzzgmU82XtLJlf6ntpWPmKS0CDgXuGOP0kZLuAh4FPml7bdVFqyZTfatTiSOi\nC+p3XG6yvbhTIklzKRaK+vgYz8dZDbzK9mZJxwH/CBxQlV/WhohokunqEKik2RQB4mu2v/GSy9lP\n295cvl4OzJY0vyrPBImIhnVxdEPA5cB9tv+qTZo9y3RIOoIiBjxelW/tZ4FK2tH2lrrpI6Km7o1u\nvIliBPIHktaUx86lfNqe7UuBk4EzJW0DngdOtauf6lHn3o0jKKLTrsC+kg4GPmD7IxN9JxHRoktB\nwvatdFgw0/YljHNQok5z42LgeMoqie27KB7WExHbqW5To8nbyes0N2bYfqhsxowY6lF5IgZPn0/L\nrhMkHi6bHJY0E/gI8EBvixUxQKb6ehLAmRRNjn2BnwL/Uh6LiC5Qnz8Pr869GxuBUyehLBGDp+H+\nhjrqjG5cxhgVIttLelKiiEEz1YMERfNixBzgPwEP96Y4EQNoqgcJ2y9aqk7SVygeGhwRXdDvzY2J\nTMt+NfDKbhckIvpTnT6JJ/llhWgGxcN6zu5loSIGSp/XJCqDRHkjyMEU61oCDHea5x0R4+D+HwKt\nbG6UAWG57aFyS4CI6LZpsKT+GkmH9rwkEQNITOF7NyTNsr2NYgmsOyU9CDxL8b5s+7BJKmPE9Nbn\n9fOqPonvAYcBJ0xSWSIGzxSfcSkonto1SWWJGExTOEjsIekT7U62Wx4rIsan30c3qoLETGAuHVa6\niYjtNIVrEhtsXzRpJYkYRA0Pb9bRsU8iInprKndcvnXSShExyKZqkLD9xGQWJGJQTeWaRERMhgSJ\niGin6SnXdSRIRDQtQSIiqqQmERHVEiQiolKfB4mJrHEZEd3SxWeBSloo6duS7pW0VtLHxkgjSRdL\nWifpbkkdl3xITSKiad2rSWwDzrK9WtI8YJWkm2zf25LmWOCAcvsN4PPlv22lJhHRMA3X2zqxvcH2\n6vL1M8B9wIJRyU4ErnLhdmA3SXtV5ZuaRETDxjG6MV/Sypb9pbaXjpmntIhiVbk7Rp1awIsfrrW+\nPLah3UUTJCKaNL67QDfZXtwpkaS5wNeBj9t+euKFKyRIRDSti6Mb
kmZTBIiv2f7GGEkeARa27O/D\nLx+ZMab0SUQ0qJurZZfPybkcuK9i5bhlwPvKUY43AE/ZbtvUgB7WJCR9CTge2Gj7oF5dJ2LK615N\n4k3Ae4EfSFpTHjsX2BfA9qXAcuA4YB3wHPD+Tpn2srlxBXAJcFUPrxEx5alLz7yyfSsdFosqH7D1\n38aTb8+ChO2byx7WiGhnCjzmLx2XEU3r82nZjQcJSUuAJQCz57684dJETL5+vwu08dEN20ttL7a9\neNacXZouTsTk6/MHBjdek4gYaFNgZaqe1SQkXQ3cBrxG0npJZ/TqWhFT2qDWJGy/q1d5R0wXI5Op\n+lmaGxEN03B/R4kEiYgmTfHH/EXEJMhkqoiolppERFRJx2VEtGegSzd49UqCRETD0icREW1lnkRE\nVLPT3IiIaqlJRES1BImIqJKaRES0ZyD3bkRElQyBRkS1jG5ERJX0SUREe7lVPCKqFDMu+ztKJEhE\nNC0dlxFRJTWJiGjP7vt5Eo0/nCdi0Mn1tlp5SV+StFHSPW3OHyXpKUlryu38TnmmJhHRtO42N64A\nLgGuqkhzi+3j62aYIBHRpC4/Vdz2zZIWdS/HNDcimjeypkSnrXuOlHSXpBsl/XqnxKlJRDSt/vd/\nvqSVLftLbS8d59VWA6+yvVnSccA/AgdU/UCCRETDxjEEusn24u25lu2nW14vl/R3kubb3tTuZxIk\nIppkYGjyhkAl7Qn81LYlHUHR5fB41c8kSEQ0SLirk6kkXQ0cRdE0WQ9cAMwGsH0pcDJwpqRtwPPA\nqXZ1ARIkIprWxSBh+10dzl9CMURaW4JERNMyLTsi2jK5wSsiquUGr4ioliAREW3ZMNzf7Y0EiYim\n9XeMSJCIaFr6JCKiWoJERLSVJ3iNz/Ob1m9a84WzHmq6HOMwH2h7Y0xf+kLTBRi3qfcZw6vqJ+36\nbeBd11dBwvYeTZdhPCSt3N678qLaQHzGCRIR0ZaBof4e3kiQiGiUwQkS09l4VwWK8Zv+n3GfNzey\nxuV26LR0mKShctnyeyT9g6SdJ3qtcin0G8rXJ0g6uyLtbpL+cALXuFDSJ+seH5XmCkknj+Nai9ot\n+95qAsuzTS0joxt1toYkSPTW87YPsX0Q8HPgQ60nVRj3/4HtZbb/vCLJbsC4g0Q0ZPIXwh2XBInJ\ncwuwf/kX9IeSrgLuARZKOlrSbZJWlzWOuQCSjpF0v6TVwO+NZCTpdEmXlK9fKen6cvXjuyS9Efhz\nYL+yFvOXZbo/lnSnpLslfbolr09JekDSrcBrOr0JSR8s87lL0tdH1Y7eJmllmd/xZfqZkv6y5dr/\ndXs/yGknQSIkzQKOBX5QHjoA+Dvbvw48C5wHvM32YcBK4BOS5gCXAb8LHA7s2Sb7i4H/a/tg4DBg\nLXA28GBZi/ljSUeX1zwCOAQ4XNKbJR0OnFoeOw54fY238w3bry+vdx9wRsu5ReU13gFcWr6HM4Cn\nbL++zP+Dkl5d4zqDwYahoXpbQ9Jx2Vs7SVpTvr4FuBzYG3jI9u3l8TcArwW+KwlgB+A24NeAf7P9\nIwBJXwWWjHGNtwDvA7A9BDwl6eWj0hxdbt8v9+dSBI15wPW2nyuvsazGezpI0p9RNGnmAitazl1r\nexj4kaQfl+/haOB1Lf0Vu5bXfqDGtQZDn3dcJkj01vO2D2k9UAaCZ1sPATeNXptQ0ot+bjsJ+Kzt\nF823lPTxCeR1BXCS7bsknU6x6OqI0b/tLq/9EdutwYRuP2VqSuvzIJHmRvNuB94kaX8ASbtIOhC4\nH1gkab8yXbsFTr8FnFn+7ExJuwLPUNQSRqwA/qClr2OBpF8BbgZOkrSTpHkUTZtO5gEbJM0GTht1\n7hRJM8oy/yrww/LaZ5bpkXSgpF1qXGdA1BzZaHB0IzWJhtl+rPyLfLWkHcvD59l+QNIS4JuSnqNo\nrswbI4uPAUslnQEMAWfavk3S
d8shxhvLfon/ANxW1mQ2A++xvVrSNcBdwEbgzhpF/lPgDuCx8t/W\nMv0/4HvAy4AP2X5B0hcp+ipWq7j4Y8BJ9T6dAWBwn0+mUocl9yOih3adtYePfFm9mLniyS+uauI+\nltQkIprW53+oEyQimjQyBNrHEiQiGuYshBsR7WXRmYioMgWWr8s8iYimebjeVoOkL0na2O4O2/Km\nwoslrSvvpTmsU54JEhENMuBh19pqugI4puL8sRTT4g+gmOb/+U4ZJkhENMnuak3C9s3AExVJTgSu\ncuF2YDdJe1XlmT6JiIZ5codAFwAPt+yvL49taPcDCRIRDXqGJ1f8i6+bXzP5HEkrW/aXTsbKXQkS\nEQ2yXdV/0AuPAAtb9vcpj7WVPomIwbIMeF85yvEGigWB2jY1IDWJiGlF0tUUa3zMl7QeuACYDWD7\nUmA5xSpk64DngPd3zDN3gUZElTQ3IqJSgkREVEqQiIhKCRIRUSlBIiIqJUhERKUEiYiolCAREZX+\nP5J3mS9q2rd8AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7feeec3e6bd0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from sklearn.metrics import confusion_matrix\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Hand-written toy labels used only to illustrate a confusion matrix.\n",
    "# They get their own names so the classifier's real `y_test` from the\n",
    "# train/test split is not overwritten.\n",
    "toy_y_true = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]\n",
    "toy_y_pred = [0, 1, 0, 0, 0, 0, 0, 1, 1, 1]\n",
    "\n",
    "# Keep the result under a different name than the function: rebinding\n",
    "# `confusion_matrix` would leave later cells with an array, not a callable.\n",
    "toy_cm = confusion_matrix(toy_y_true, toy_y_pred)\n",
    "print(toy_cm)\n",
    "\n",
    "# Rows are the true labels, columns the predicted labels.\n",
    "plt.matshow(toy_cm)\n",
    "plt.title('Confusion matrix')\n",
    "plt.colorbar()\n",
    "plt.ylabel('True label')\n",
    "plt.xlabel('Predicted label')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
